# Load necessary packages
library(dplyr)
library(ggplot2)
library(did)
library(readr)

# Load necessary data

load("00_data/01_data_processed/data_epa_analysis.Rdata")

# Preparing the data for modeling

# Set treatment year for analysis
epa_only$treatment_year_reg <- ifelse(epa_only$treatment_region == 0, 0, 1995)
epa_only$treatment_year_sta <- ifelse(epa_only$treatment_state == 0, 0, 1995)

final_panel_data$treatment_year_reg <- ifelse(final_panel_data$treatment_region == 0, 0, 1995)
final_panel_data$treatment_year_sta <- ifelse(final_panel_data$treatment_state == 0, 0, 1995)

# Convert PGM_SYS_ID to a numeric identifier
epa_only$PGM_SYS_ID_num <- as.numeric(as.factor(epa_only$PGM_SYS_ID))
final_panel_data$PGM_SYS_ID_num <- as.numeric(as.factor(final_panel_data$PGM_SYS_ID))

# Main analysis for EPA inspections using regional treatment year
set.seed(123)
model_epa_region <- att_gt(
  yname = "outcome_epa",
  tname = "year",
  idname = "PGM_SYS_ID_num",
  gname = "treatment_year_reg",
  data = epa_only[epa_only$year < 2002 & epa_only$year > 1989, ],
  alp = .01
)

# Aggregate treatment effects
aggte_model_epa_region_dynamic <- aggte(model_epa_region, type = "dynamic", na.rm = TRUE)
aggte_model_epa_region_simple <- aggte(model_epa_region, type = "simple", na.rm = TRUE)

# Plotting the results
plot_epa_region_dynamic <- ggdid(aggte_model_epa_region_dynamic)

plot_epa_region_dynamic$data %>%
  ggplot(aes(x = year, y = att,
             ymin = (att - c * att.se),
             ymax = (att + c * att.se))) +
  geom_point() +
  geom_errorbar(width = .1) +
  geom_vline(xintercept = -1, linetype = 3) +
  geom_vline(xintercept = 4, linetype = 3) +
  geom_hline(yintercept = 0, linetype = 1) +
  ylab("Estimate, 99% conf. band") +
  xlab("Years since program start") +
  theme_bw() +
  scale_x_continuous(breaks = -10:7) +
  scale_y_continuous(breaks = seq(-.30, .08, by = 0.04)) +
  coord_cartesian(ylim = c(-.25, .08))

ggsave("03_figures/figure4.pdf", height = 3, width = 8)
ggsave("03_figures/figure4.jpg", height = 3, width = 8, dpi=600)


# Main analysis for the entire dataset including all regions
set.seed(123)
model_epa_all_regions <- att_gt(
  yname = "outcome_epa",
  tname = "year",
  idname = "PGM_SYS_ID_num",
  gname = "treatment_year_reg",
  data = final_panel_data[final_panel_data$year < 2002 & final_panel_data$year > 1989, ],
  anticipation = 0,
  alp = .01
)

aggte_model_epa_all_dynamic <- aggte(model_epa_all_regions, type = "dynamic", na.rm = TRUE)
aggte_model_epa_all_simple <- aggte(model_epa_all_regions, type = "simple", na.rm = TRUE, max_e = 4)

plot_epa_all_dynamic <- ggdid(aggte_model_epa_all_dynamic)

plot_epa_all_dynamic$data %>%
  ggplot(aes(x = year, y = att,
             ymin = (att - c * att.se),
             ymax = (att + c * att.se))) +
  geom_point() +
  geom_errorbar(width = .1) +
  geom_vline(xintercept = -1, linetype = 3) +
  geom_vline(xintercept = 4, linetype = 3) +
  geom_hline(yintercept = 0, linetype = 1) +
  ylab("Estimate, 99% conf. band") +
  xlab("Years since program start") +
  theme_bw() +
  scale_x_continuous(breaks = -10:7) +
  scale_y_continuous(breaks = seq(-.03, .08, by = 0.004)) +
  coord_cartesian(ylim = c(-.03, .01))

ggsave("03_figures/appendix/figure_a3.pdf", height = 3, width = 8)

# Analysis for on-site and off-site inspections
set.seed(123)
model_epa_on_site <- att_gt(
  yname = "outcome_epa_on_site",
  tname = "year",
  idname = "PGM_SYS_ID_num",
  gname = "treatment_year_reg",
  data = epa_only[epa_only$year < 2002 & epa_only$year > 1989,],
  anticipation = 0,
  alp = .01
)

summary(model_epa_on_site)
ggdid(model_epa_on_site)
aggte_model_epa_on_site_dynamic <- aggte(model_epa_on_site, type = "dynamic", na.rm = TRUE)
aggte_model_epa_on_site_simple <- aggte(model_epa_on_site, type = "simple", na.rm = TRUE)

plot_epa_on_site_dynamic <- ggdid(aggte_model_epa_on_site_dynamic)
plot_epa_on_site_dynamic$data$name <- "On-site"

set.seed(123)
model_epa_off_site <- att_gt(
  yname = "outcome_epa_off_site",
  tname = "year",
  idname = "PGM_SYS_ID_num",
  gname = "treatment_year_reg",
  data = epa_only[epa_only$year < 2002 & epa_only$year > 1989,],
  alp = .01
)

aggte_model_epa_off_site_dynamic <- aggte(model_epa_off_site, type = "dynamic", na.rm = TRUE)
aggte_model_epa_off_site_simple <- aggte(model_epa_off_site, type = "simple", na.rm = TRUE)

plot_epa_off_site_dynamic <- ggdid(aggte_model_epa_off_site_dynamic)
plot_epa_off_site_dynamic$data$name <- "Off-site"

plot_epa_site_comparison <- bind_rows(plot_epa_on_site_dynamic$data, plot_epa_off_site_dynamic$data)

plot_epa_site_comparison %>%
  ggplot(aes(x = year, y = att,
             ymin = (att - c * att.se),
             ymax = (att + c * att.se),
             color = name)) +
  geom_pointrange(position = position_dodge2(width = .3), size = .1) +
  geom_vline(xintercept = -1, linetype = 3) +
  geom_vline(xintercept = 4, linetype = 3) +
  geom_hline(yintercept = 0, linetype = 1) +
  ylab("Estimate, 99% conf. band") +
  xlab("Years since program start") +
  theme_bw() +
  scale_x_continuous(breaks = -10:7) +
  scale_y_continuous(breaks = seq(-.30, .08, 0.04)) +
  coord_cartesian(ylim = c(-.25, .08)) +
  scale_color_manual(values = c("black", "blue")) +
  theme(
    legend.background = element_rect(fill = "white", linewidth = 0.2, linetype = "solid", colour = "black"),
    legend.position = c(0.9, 0.2),
    legend.title = element_blank()
  )

ggsave("03_figures/figure5.pdf", height = 3, width = 8)
ggsave("03_figures/figure5.jpg", height = 3, width = 8,dpi=600)


# Analysis for substitution with other inspections
set.seed(123)
model_non_epa_substitution <- att_gt(
  yname = "other_inspection",
  tname = "year",
  idname = "PGM_SYS_ID_num",
  gname = "treatment_year_reg",
  data = epa_only[epa_only$year < 2002 & epa_only$year > 1989,],
  anticipation = 0,
  alp = .01
)

aggte_model_non_epa_substitution_dynamic <- aggte(model_non_epa_substitution, type = "dynamic", na.rm = TRUE)
aggte_model_non_epa_substitution_simple <- aggte(model_non_epa_substitution, type = "simple", na.rm = TRUE)

plot_non_epa_substitution <- ggdid(aggte_model_non_epa_substitution_dynamic)

plot_non_epa_substitution$data %>%
  ggplot(aes(x = year, y = att,
             ymin = (att - c * att.se),
             ymax = (att + c * att.se))) +
  geom_point() +
  geom_errorbar(width = .1) +
  geom_vline(xintercept = -1, linetype = 3) +
  geom_vline(xintercept = 4, linetype = 3) +
  geom_hline(yintercept = 0, linetype = 1) +
  ylab("Estimate, 99% conf. band") +
  xlab("Years since program start") +
  theme_bw() +
  scale_x_continuous(breaks = -10:7) +
  scale_y_continuous(breaks = seq(-.30, .08, 0.04)) +
  coord_cartesian(ylim = c(-.25, .08))

ggsave("03_figures/figure6.pdf", height = 3, width = 8)
ggsave("03_figures/figure6.jpg", height = 3, width = 8, dpi=600)

# State treatment analysis
set.seed(123)
model_state_treatment <- att_gt(
  yname = "outcome_epa",
  tname = "year",
  idname = "PGM_SYS_ID_num",
  gname = "treatment_year_sta",
  data = epa_only[epa_only$year < 2002 & epa_only$year > 1989, ],
  anticipation = 0,
  alp = .01
)

aggte_model_state_treatment_dynamic <- aggte(model_state_treatment, type = "dynamic", na.rm = TRUE)
aggte_model_state_treatment_simple <- aggte(model_state_treatment, type = "simple", na.rm = TRUE)

plot_state_treatment_dynamic <- ggdid(aggte_model_state_treatment_dynamic)

plot_state_treatment_dynamic$data %>%
  ggplot(aes(x = year, y = att,
             ymin = (att - c * att.se),
             ymax = (att + c * att.se))) +
  geom_point() +
  geom_errorbar(width = .1) +
  geom_vline(xintercept = -1, linetype = 3) +
  geom_vline(xintercept = 4, linetype = 3) +
  geom_hline(yintercept = 0, linetype = 1) +
  ylab("Estimate, 99% conf. band") +
  xlab("Years since program start") +
  theme_bw() +
  scale_x_continuous(breaks = -10:7) +
  scale_y_continuous(breaks = seq(-.30, .08, 0.04)) +
  coord_cartesian(ylim = c(-.25, .08))

ggsave("03_figures/appendix/figure_a5.pdf", height = 3, width = 8)

# Subset analysis excluding specific sites
FRS_ids <- read_csv("00_data/01_data_processed/FRS_ids.csv")
phase1sights <- read_delim("00_data/01_data_processed/phase1sites.csv", delim = ";", escape_double = FALSE, trim_ws = TRUE)
phase1sights$CAMD_PLANT_ID <- phase1sights$ORISPL

phase1sights <- merge(phase1sights, FRS_ids, by = "CAMD_PLANT_ID", all.x = TRUE)

epa_only_excluded <- epa_only[!epa_only$REGISTRY_ID %in% phase1sights$FRS_ID, ]

set.seed(123)
model_excluded_sites <- att_gt(
  yname = "outcome_epa",
  tname = "year",
  idname = "PGM_SYS_ID_num",
  gname = "treatment_year_reg",
  data = epa_only_excluded[epa_only_excluded$year < 2002 & epa_only_excluded$year > 1989, ],
  anticipation = 0,
  alp = .01
)

aggte_model_excluded_sites_dynamic <- aggte(model_excluded_sites, type = "dynamic", na.rm = TRUE)
aggte_model_excluded_sites_simple <- aggte(model_excluded_sites, type = "simple", na.rm = TRUE)

plot_excluded_sites_dynamic <- ggdid(aggte_model_excluded_sites_dynamic)

plot_excluded_sites_dynamic$data %>%
  ggplot(aes(x = year, y = att,
             ymin = (att - c * att.se),
             ymax = (att + c * att.se))) +
  geom_point() +
  geom_errorbar(width = .1) +
  geom_vline(xintercept = -1, linetype = 3) +
  geom_vline(xintercept = 4, linetype = 3) +
  geom_hline(yintercept = 0, linetype = 1) +
  ylab("Estimate, 99% conf. band") +
  xlab("Years since program start") +
  theme_bw() +
  scale_x_continuous(breaks = -10:7) +
  scale_y_continuous(breaks = seq(-.30, .08, 0.04)) +
  coord_cartesian(ylim = c(-.25, .08))

ggsave("03_figures/appendix/figure_a4.pdf", height = 3, width = 8)

# Analysis with censoring variables
model_censored <- att_gt(
  yname = "outcome_epa",
  tname = "year",
  xformla = ~ is_censored + is_censored_first,
  idname = "PGM_SYS_ID_num",
  gname = "treatment_year_reg",
  data = epa_only[epa_only$year < 2002 & epa_only$year > 1989, ],
  anticipation = 0,
  alp = .01
)

aggte_model_censored_dynamic <- aggte(model_censored, type = "dynamic")

plot_censored_dynamic <- ggdid(aggte_model_censored_dynamic)

plot_censored_dynamic$data %>%
  ggplot(aes(x = year, y = att,
             ymin = (att - c * att.se),
             ymax = (att + c * att.se))) +
  geom_point() +
  geom_errorbar(width = .1) +
  geom_vline(xintercept = -1, linetype = 3) +
  geom_vline(xintercept = 4, linetype = 3) +
  geom_hline(yintercept = 0, linetype = 1) +
  ylab("Estimate, 99% conf. band") +
  xlab("Years since program start") +
  theme_bw() +
  scale_x_continuous(breaks = -10:7) +
  scale_y_continuous(breaks = seq(-.30, .08, 0.04)) +
  coord_cartesian(ylim = c(-.25, .08))

ggsave("03_figures/appendix/figure_a6.pdf", height = 3, width = 8)


save(aggte_model_epa_region_dynamic,
     aggte_model_epa_all_dynamic,
     aggte_model_epa_on_site_dynamic,
     aggte_model_epa_off_site_dynamic, 
     aggte_model_non_epa_substitution_dynamic,
     aggte_model_state_treatment_dynamic,
     aggte_model_excluded_sites_dynamic,
     aggte_model_censored_dynamic,
     file="00_data/01_data_processed/models_dynamic.Rdata")

rm(list = ls())
